# Load necessary libraries
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr) # For printing output nicely
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyr)
library(visdat)
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
##
## timestamp
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:arrow':
##
## duration
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2) # Fo
# Read the CSV file
df_raw <- read.csv("Traffic_Crashes_-_Crashes_20240302.csv")
# Print column names
print(colnames(df_raw))
## [1] "CRASH_RECORD_ID" "CRASH_DATE_EST_I"
## [3] "CRASH_DATE" "POSTED_SPEED_LIMIT"
## [5] "TRAFFIC_CONTROL_DEVICE" "DEVICE_CONDITION"
## [7] "WEATHER_CONDITION" "LIGHTING_CONDITION"
## [9] "FIRST_CRASH_TYPE" "TRAFFICWAY_TYPE"
## [11] "LANE_CNT" "ALIGNMENT"
## [13] "ROADWAY_SURFACE_COND" "ROAD_DEFECT"
## [15] "REPORT_TYPE" "CRASH_TYPE"
## [17] "INTERSECTION_RELATED_I" "NOT_RIGHT_OF_WAY_I"
## [19] "HIT_AND_RUN_I" "DAMAGE"
## [21] "DATE_POLICE_NOTIFIED" "PRIM_CONTRIBUTORY_CAUSE"
## [23] "SEC_CONTRIBUTORY_CAUSE" "STREET_NO"
## [25] "STREET_DIRECTION" "STREET_NAME"
## [27] "BEAT_OF_OCCURRENCE" "PHOTOS_TAKEN_I"
## [29] "STATEMENTS_TAKEN_I" "DOORING_I"
## [31] "WORK_ZONE_I" "WORK_ZONE_TYPE"
## [33] "WORKERS_PRESENT_I" "NUM_UNITS"
## [35] "MOST_SEVERE_INJURY" "INJURIES_TOTAL"
## [37] "INJURIES_FATAL" "INJURIES_INCAPACITATING"
## [39] "INJURIES_NON_INCAPACITATING" "INJURIES_REPORTED_NOT_EVIDENT"
## [41] "INJURIES_NO_INDICATION" "INJURIES_UNKNOWN"
## [43] "CRASH_HOUR" "CRASH_DAY_OF_WEEK"
## [45] "CRASH_MONTH" "LATITUDE"
## [47] "LONGITUDE" "LOCATION"
# Print the number of rows and columns
cat("Number of rows:", nrow(df_raw), "\n")
## Number of rows: 810658
cat("Number of columns:", ncol(df_raw), "\n")
## Number of columns: 48
# Display information about the dataset
str(df_raw)
## 'data.frame': 810658 obs. of 48 variables:
## $ CRASH_RECORD_ID : chr "6c1659069e9c6285a650e70d6f9b574ed5f64c12888479093dfeef179c0344ec6d2057eae224b5c0d5dfc278c0a237f8c22543f07fdef2e"| __truncated__ "5f54a59fcb087b12ae5b1acff96a3caf4f2d37e79f8db4106558b34b8a6d2b81af02cf91b576ecd7ced08ffd10fcfd940a84f7613125b89"| __truncated__ "61fcb8c1eb522a6469b460e2134df3d15f82e81fd93e9cafd3dc7e631b9e1ba8b450a63af12bd90d1d2d9b127ea287f88d32e138a4eeba1"| __truncated__ "004cd14d0303a9163aad69a2d7f341b7da2a8572b2ab3378594bfae8ac53dcb604dd8d414f93c290b55862f9f2517ad32e6209cbc8034c2"| __truncated__ ...
## $ CRASH_DATE_EST_I : chr "" "" "" "" ...
## $ CRASH_DATE : chr "08/18/2023 12:50:00 PM" "07/29/2023 02:45:00 PM" "08/18/2023 05:58:00 PM" "11/26/2019 08:38:00 AM" ...
## $ POSTED_SPEED_LIMIT : int 15 30 30 25 20 30 30 35 30 25 ...
## $ TRAFFIC_CONTROL_DEVICE : chr "OTHER" "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" ...
## $ DEVICE_CONDITION : chr "FUNCTIONING PROPERLY" "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" ...
## $ WEATHER_CONDITION : chr "CLEAR" "CLEAR" "CLEAR" "CLEAR" ...
## $ LIGHTING_CONDITION : chr "DAYLIGHT" "DAYLIGHT" "DAYLIGHT" "DAYLIGHT" ...
## $ FIRST_CRASH_TYPE : chr "REAR END" "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "PEDESTRIAN" ...
## $ TRAFFICWAY_TYPE : chr "OTHER" "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "ONE-WAY" ...
## $ LANE_CNT : chr "" "" "" "" ...
## $ ALIGNMENT : chr "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "CURVE ON GRADE" ...
## $ ROADWAY_SURFACE_COND : chr "DRY" "DRY" "DRY" "DRY" ...
## $ ROAD_DEFECT : chr "NO DEFECTS" "NO DEFECTS" "NO DEFECTS" "NO DEFECTS" ...
## $ REPORT_TYPE : chr "ON SCENE" "ON SCENE" "ON SCENE" "ON SCENE" ...
## $ CRASH_TYPE : chr "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "INJURY AND / OR TOW DUE TO CRASH" ...
## $ INTERSECTION_RELATED_I : chr "" "" "" "" ...
## $ NOT_RIGHT_OF_WAY_I : chr "" "" "" "" ...
## $ HIT_AND_RUN_I : chr "" "Y" "" "" ...
## $ DAMAGE : chr "OVER $1,500" "OVER $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ DATE_POLICE_NOTIFIED : chr "08/18/2023 12:55:00 PM" "07/29/2023 02:45:00 PM" "08/18/2023 06:01:00 PM" "11/26/2019 08:38:00 AM" ...
## $ PRIM_CONTRIBUTORY_CAUSE : chr "FOLLOWING TOO CLOSELY" "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" ...
## $ SEC_CONTRIBUTORY_CAUSE : chr "DISTRACTION - FROM INSIDE VEHICLE" "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" ...
## $ STREET_NO : int 700 2101 3422 5 3 1732 2 9000 5900 4546 ...
## $ STREET_DIRECTION : chr "W" "S" "N" "W" ...
## $ STREET_NAME : chr "OHARE ST" "ASHLAND AVE" "LONG AVE" "TERMINAL ST" ...
## $ BEAT_OF_OCCURRENCE : int 1654 1235 1633 1655 1653 1814 1652 2221 232 1131 ...
## $ PHOTOS_TAKEN_I : chr "" "" "" "Y" ...
## $ STATEMENTS_TAKEN_I : chr "" "" "" "Y" ...
## $ DOORING_I : chr "" "" "" "" ...
## $ WORK_ZONE_I : chr "" "" "" "" ...
## $ WORK_ZONE_TYPE : chr "" "" "" "" ...
## $ WORKERS_PRESENT_I : chr "" "" "" "" ...
## $ NUM_UNITS : int 2 4 2 2 1 2 2 2 2 2 ...
## $ MOST_SEVERE_INJURY : chr "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "FATAL" ...
## $ INJURIES_TOTAL : int 1 0 1 1 0 0 0 0 0 0 ...
## $ INJURIES_FATAL : int 0 0 0 1 0 0 0 0 0 0 ...
## $ INJURIES_INCAPACITATING : int 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NON_INCAPACITATING : int 1 0 1 0 0 0 0 0 0 0 ...
## $ INJURIES_REPORTED_NOT_EVIDENT: int 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NO_INDICATION : int 1 1 1 1 1 3 2 2 2 5 ...
## $ INJURIES_UNKNOWN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRASH_HOUR : int 12 14 17 8 10 13 17 13 0 19 ...
## $ CRASH_DAY_OF_WEEK : int 6 7 6 3 6 7 2 1 1 2 ...
## $ CRASH_MONTH : int 8 7 8 11 8 7 2 8 8 1 ...
## $ LATITUDE : num NA 41.9 41.9 NA NA ...
## $ LONGITUDE : num NA -87.7 -87.8 NA NA ...
## $ LOCATION : chr "" "POINT (-87.665902342962 41.854120262952)" "POINT (-87.761883496974 41.942975745006)" "" ...
# Rename columns
df_1 <- df_raw %>%
rename(inj_non_incap = INJURIES_NON_INCAPACITATING,
inj_report_not_evdnt = INJURIES_REPORTED_NOT_EVIDENT)
# Drop unwanted columns
df_1 <- df_raw %>%
select(-CRASH_RECORD_ID, -CRASH_DATE_EST_I, -LANE_CNT, -LOCATION, -REPORT_TYPE, -DATE_POLICE_NOTIFIED,
-PHOTOS_TAKEN_I, -STATEMENTS_TAKEN_I, -DOORING_I, -WORK_ZONE_I, -WORK_ZONE_TYPE, -WORKERS_PRESENT_I)
# Assuming df_1 is your dataframe
df_c <- df_1
colnames(df_c)
## [1] "CRASH_DATE" "POSTED_SPEED_LIMIT"
## [3] "TRAFFIC_CONTROL_DEVICE" "DEVICE_CONDITION"
## [5] "WEATHER_CONDITION" "LIGHTING_CONDITION"
## [7] "FIRST_CRASH_TYPE" "TRAFFICWAY_TYPE"
## [9] "ALIGNMENT" "ROADWAY_SURFACE_COND"
## [11] "ROAD_DEFECT" "CRASH_TYPE"
## [13] "INTERSECTION_RELATED_I" "NOT_RIGHT_OF_WAY_I"
## [15] "HIT_AND_RUN_I" "DAMAGE"
## [17] "PRIM_CONTRIBUTORY_CAUSE" "SEC_CONTRIBUTORY_CAUSE"
## [19] "STREET_NO" "STREET_DIRECTION"
## [21] "STREET_NAME" "BEAT_OF_OCCURRENCE"
## [23] "NUM_UNITS" "MOST_SEVERE_INJURY"
## [25] "INJURIES_TOTAL" "INJURIES_FATAL"
## [27] "INJURIES_INCAPACITATING" "INJURIES_NON_INCAPACITATING"
## [29] "INJURIES_REPORTED_NOT_EVIDENT" "INJURIES_NO_INDICATION"
## [31] "INJURIES_UNKNOWN" "CRASH_HOUR"
## [33] "CRASH_DAY_OF_WEEK" "CRASH_MONTH"
## [35] "LATITUDE" "LONGITUDE"
# Assuming df_c is your dataframe
df_c <- df_c %>%
mutate(INTERSECTION_RELATED_I = if_else(is.na(INTERSECTION_RELATED_I), 'N', INTERSECTION_RELATED_I),
NOT_RIGHT_OF_WAY_I = if_else(is.na(NOT_RIGHT_OF_WAY_I), 'N', NOT_RIGHT_OF_WAY_I),
HIT_AND_RUN_I = if_else(is.na(HIT_AND_RUN_I), 'N', HIT_AND_RUN_I))
# Set columns to 0 where injuries_total is missing and crash_type is 'NO INJURY / DRIVE AWAY'
df_c <- df_c %>%
mutate(INJURIES_TOTAL = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_TOTAL),
INJURIES_FATAL = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_FATAL),
INJURIES_INCAPACITATING = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_INCAPACITATING),
INJURIES_NON_INCAPACITATING = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_NON_INCAPACITATING),
INJURIES_REPORTED_NOT_EVIDENT = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_REPORTED_NOT_EVIDENT),
INJURIES_NO_INDICATION = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_NO_INDICATION),
INJURIES_UNKNOWN = ifelse(is.na(INJURIES_TOTAL) & CRASH_TYPE == 'NO INJURY / DRIVE AWAY', 0, INJURIES_UNKNOWN))
# Drop rows where injuries_total is missing
df_c <- df_c %>%
filter(!is.na(INJURIES_TOTAL))
# wherever most_severe_injury is missing and injuries_total is 0
df_c <- df_c %>%
mutate(MOST_SEVERE_INJURY = ifelse(is.na(MOST_SEVERE_INJURY) & INJURIES_TOTAL == 0,
'NO INDICATION OF INJURY', MOST_SEVERE_INJURY))
# Print table of MOST_SEVERE_INJURY values
print(as.data.frame(table(df_c$MOST_SEVERE_INJURY, useNA = "ifany")))
## Var1 Freq
## 1 1007
## 2 FATAL 889
## 3 INCAPACITATING INJURY 13731
## 4 NO INDICATION OF INJURY 697464
## 5 NONINCAPACITATING INJURY 62887
## 6 REPORTED, NOT EVIDENT 33906
ggplot(df_c, aes(x = MOST_SEVERE_INJURY)) +
geom_bar() + # Add bars
labs(x = "MOST_SEVERE_INJURY", y = "Number of Records", title = "Number of Records per Category") + # Add labels and title
theme_minimal() + # Minimal theme
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert CRASH_DATE to datetime and create crash_year column
df_c <- df_c %>%
mutate(CRASH_DATE = mdy_hms(CRASH_DATE),
crash_year = year(CRASH_DATE))
# Create address column
df_c <- df_c %>%
mutate(
STREET_NO = as.character(STREET_NO),
STREET_DIRECTION = as.character(STREET_DIRECTION),
STREET_NAME = as.character(STREET_NAME),
address = paste0(STREET_NO, " ", STREET_DIRECTION, " ", STREET_NAME)
)
# Assuming df_c is your dataframe
df_c2 <- df_c
# Using base R unique function to drop duplicates
df_c2 <- unique(df_c)
# Using dplyr's distinct function
df_c2 <- distinct(df_c)
# Drop rows that have both latitude and longitude == 0
df_c2 <- df_c2 %>%
filter(LATITUDE != 0 & LONGITUDE != 0)
# Convert columns to integer type
df_c2 <- df_c2 %>%
mutate(
BEAT_OF_OCCURRENCE = as.integer(BEAT_OF_OCCURRENCE),
INJURIES_TOTAL = as.integer(INJURIES_TOTAL),
INJURIES_FATAL = as.integer(INJURIES_FATAL),
INJURIES_INCAPACITATING = as.integer(INJURIES_INCAPACITATING),
INJURIES_NON_INCAPACITATING = as.integer(INJURIES_NON_INCAPACITATING),
INJURIES_REPORTED_NOT_EVIDENT = as.integer(INJURIES_REPORTED_NOT_EVIDENT),
INJURIES_NO_INDICATION = as.integer(INJURIES_NO_INDICATION)
)
# Drop columns "not_right_of_way_i" and "injuries_unknown"
df_c2 <- df_c2 %>%
select(-NOT_RIGHT_OF_WAY_I, -INJURIES_UNKNOWN)
# Inspect the structure of the dataframe
str(df_c2)
## 'data.frame': 804227 obs. of 36 variables:
## $ CRASH_DATE : POSIXct, format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
## $ POSTED_SPEED_LIMIT : int 30 30 10 30 15 30 30 30 30 20 ...
## $ TRAFFIC_CONTROL_DEVICE : chr "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
## $ DEVICE_CONDITION : chr "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
## $ WEATHER_CONDITION : chr "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
## $ LIGHTING_CONDITION : chr "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
## $ FIRST_CRASH_TYPE : chr "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
## $ TRAFFICWAY_TYPE : chr "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
## $ ALIGNMENT : chr "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
## $ ROADWAY_SURFACE_COND : chr "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
## $ ROAD_DEFECT : chr "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
## $ CRASH_TYPE : chr "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
## $ INTERSECTION_RELATED_I : chr "" "" "" "" ...
## $ HIT_AND_RUN_I : chr "Y" "" "Y" "Y" ...
## $ DAMAGE : chr "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ PRIM_CONTRIBUTORY_CAUSE : chr "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
## $ SEC_CONTRIBUTORY_CAUSE : chr "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ STREET_NO : chr "2101" "3422" "1320" "999" ...
## $ STREET_DIRECTION : chr "S" "N" "E" "N" ...
## $ STREET_NAME : chr "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
## $ BEAT_OF_OCCURRENCE : int 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
## $ NUM_UNITS : int 4 2 2 2 2 2 3 2 2 2 ...
## $ MOST_SEVERE_INJURY : chr "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" ...
## $ INJURIES_TOTAL : int 0 1 0 0 0 5 0 0 0 0 ...
## $ INJURIES_FATAL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_INCAPACITATING : int 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NON_INCAPACITATING : int 0 1 0 0 0 5 0 0 0 0 ...
## $ INJURIES_REPORTED_NOT_EVIDENT: int 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NO_INDICATION : int 1 1 2 2 2 0 4 2 3 1 ...
## $ CRASH_HOUR : int 14 17 14 0 12 19 10 18 14 12 ...
## $ CRASH_DAY_OF_WEEK : int 7 6 7 7 4 4 1 4 7 1 ...
## $ CRASH_MONTH : int 7 8 7 7 9 8 8 9 8 7 ...
## $ LATITUDE : num 41.9 41.9 41.8 41.9 41.7 ...
## $ LONGITUDE : num -87.7 -87.8 -87.6 -87.7 -87.6 ...
## $ crash_year : num 2023 2023 2023 2023 2023 ...
## $ address : chr "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
#write_parquet(df_c2, file_parquet_c)
write_parquet(df_c2, "Intmd_data.parquet")
# Read Parquet file into a dataframe
crash_df <- arrow::read_parquet("Intmd_data.parquet")
# Inspect the structure of the dataframe
str(crash_df)
## tibble [804,227 × 36] (S3: tbl_df/tbl/data.frame)
## $ CRASH_DATE : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
## $ POSTED_SPEED_LIMIT : int [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
## $ TRAFFIC_CONTROL_DEVICE : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
## $ DEVICE_CONDITION : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
## $ WEATHER_CONDITION : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
## $ LIGHTING_CONDITION : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
## $ FIRST_CRASH_TYPE : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
## $ TRAFFICWAY_TYPE : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
## $ ALIGNMENT : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
## $ ROADWAY_SURFACE_COND : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
## $ ROAD_DEFECT : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
## $ CRASH_TYPE : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
## $ INTERSECTION_RELATED_I : chr [1:804227] "" "" "" "" ...
## $ HIT_AND_RUN_I : chr [1:804227] "Y" "" "Y" "Y" ...
## $ DAMAGE : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ PRIM_CONTRIBUTORY_CAUSE : chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
## $ SEC_CONTRIBUTORY_CAUSE : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ STREET_NO : chr [1:804227] "2101" "3422" "1320" "999" ...
## $ STREET_DIRECTION : chr [1:804227] "S" "N" "E" "N" ...
## $ STREET_NAME : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
## $ BEAT_OF_OCCURRENCE : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
## $ NUM_UNITS : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
## $ MOST_SEVERE_INJURY : chr [1:804227] "NO INDICATION OF INJURY" "NONINCAPACITATING INJURY" "NO INDICATION OF INJURY" "NO INDICATION OF INJURY" ...
## $ INJURIES_TOTAL : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
## $ INJURIES_FATAL : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_INCAPACITATING : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NON_INCAPACITATING : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
## $ INJURIES_REPORTED_NOT_EVIDENT: int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES_NO_INDICATION : int [1:804227] 1 1 2 2 2 0 4 2 3 1 ...
## $ CRASH_HOUR : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
## $ CRASH_DAY_OF_WEEK : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
## $ CRASH_MONTH : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
## $ LATITUDE : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
## $ LONGITUDE : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
## $ crash_year : num [1:804227] 2023 2023 2023 2023 2023 ...
## $ address : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
# Rounding at mid-point to nearest round_unit
round_unit <- 5
crash_df$POSTED_SPEED_LIMIT <- (crash_df$POSTED_SPEED_LIMIT %/% round_unit * round_unit) + round((crash_df$POSTED_SPEED_LIMIT %% round_unit) / round_unit) * round_unit
# Counting the occurrences of each rounded value
table(crash_df$POSTED_SPEED_LIMIT)
##
## 0 5 10 15 20 25 30 35 40 45 50
## 7455 4831 18706 28251 32902 50705 593652 53804 7741 5191 218
## 55 60 65 70 100
## 642 43 15 5 66
# Create 'has_injuries' column based on injuries_total
crash_df$has_injuries <- as.integer(crash_df$INJURIES_TOTAL > 0)
# Count the occurrences of each unique value in 'has_injuries' column
table(crash_df$has_injuries)
##
## 0 1
## 693524 110703
# Create 'has_fatal' column based on injuries_fatal
crash_df$has_fatal <- as.integer(crash_df$INJURIES_FATAL > 0)
# Count the occurrences of each unique value in 'has_fatal' column
table(crash_df$has_fatal)
##
## 0 1
## 802361 883
# Create 'crash_time_of_day' column using cut function
crash_df$crash_time_of_day <- cut(crash_df$CRASH_HOUR, breaks = c(-Inf, 6, 12, 18, Inf), labels = c("overnight", "morning", "mid_day", "evening"))
# Count the occurrences of each category in 'crash_time_of_day' column
table(crash_df$crash_time_of_day)
##
## overnight morning mid_day evening
## 93200 238054 335923 137050
# Define list of feature names
features_names <- c(
'crash_date',
'crash_year',
'crash_month',
'crash_day_of_week',
'crash_hour',
'crash_time_of_day', # New
'latitude',
'longitude',
'beat_of_occurrence',
'address',
'street_no',
'street_direction',
'street_name',
'posted_speed_limit',
'traffic_control_device',
'device_condition',
'weather_condition',
'lighting_condition',
'trafficway_type',
'alignment',
'roadway_surface_cond',
'road_defect',
'first_crash_type',
'prim_contributory_cause',
'sec_contributory_cause',
'num_units'
)
# Define list of target names
target_names <- c(
'has_injuries', # New
'has_fatal', # New
'crash_type',
'damage',
'injuries_total',
'injuries_fatal'
)
# Combine feature and target names
all_columns <- c(features_names, target_names)
colnames(crash_df)
## [1] "CRASH_DATE" "POSTED_SPEED_LIMIT"
## [3] "TRAFFIC_CONTROL_DEVICE" "DEVICE_CONDITION"
## [5] "WEATHER_CONDITION" "LIGHTING_CONDITION"
## [7] "FIRST_CRASH_TYPE" "TRAFFICWAY_TYPE"
## [9] "ALIGNMENT" "ROADWAY_SURFACE_COND"
## [11] "ROAD_DEFECT" "CRASH_TYPE"
## [13] "INTERSECTION_RELATED_I" "HIT_AND_RUN_I"
## [15] "DAMAGE" "PRIM_CONTRIBUTORY_CAUSE"
## [17] "SEC_CONTRIBUTORY_CAUSE" "STREET_NO"
## [19] "STREET_DIRECTION" "STREET_NAME"
## [21] "BEAT_OF_OCCURRENCE" "NUM_UNITS"
## [23] "MOST_SEVERE_INJURY" "INJURIES_TOTAL"
## [25] "INJURIES_FATAL" "INJURIES_INCAPACITATING"
## [27] "INJURIES_NON_INCAPACITATING" "INJURIES_REPORTED_NOT_EVIDENT"
## [29] "INJURIES_NO_INDICATION" "CRASH_HOUR"
## [31] "CRASH_DAY_OF_WEEK" "CRASH_MONTH"
## [33] "LATITUDE" "LONGITUDE"
## [35] "crash_year" "address"
## [37] "has_injuries" "has_fatal"
## [39] "crash_time_of_day"
# Assuming df_c is your dataframe
names(crash_df) <- tolower(names(crash_df))
colnames(crash_df)
## [1] "crash_date" "posted_speed_limit"
## [3] "traffic_control_device" "device_condition"
## [5] "weather_condition" "lighting_condition"
## [7] "first_crash_type" "trafficway_type"
## [9] "alignment" "roadway_surface_cond"
## [11] "road_defect" "crash_type"
## [13] "intersection_related_i" "hit_and_run_i"
## [15] "damage" "prim_contributory_cause"
## [17] "sec_contributory_cause" "street_no"
## [19] "street_direction" "street_name"
## [21] "beat_of_occurrence" "num_units"
## [23] "most_severe_injury" "injuries_total"
## [25] "injuries_fatal" "injuries_incapacitating"
## [27] "injuries_non_incapacitating" "injuries_reported_not_evident"
## [29] "injuries_no_indication" "crash_hour"
## [31] "crash_day_of_week" "crash_month"
## [33] "latitude" "longitude"
## [35] "crash_year" "address"
## [37] "has_injuries" "has_fatal"
## [39] "crash_time_of_day"
file_crash_df_parquet <- "crash_df.parquet"
# Write dataframe to Parquet format
write_parquet(crash_df[all_columns], file_crash_df_parquet)
# Replace crash_df with your data frame variable name and all_columns with the columns you want to include
write_parquet(crash_df[, all_columns], "file_crash_df.parquet")
# Replace file_crash_df_parquet with the path to your Parquet file
crash_df <- read_parquet("file_crash_df.parquet")
# Replace file_crash_df_parquet with the path to your Parquet file
crash_df <- read_parquet("file_crash_df.parquet")
# Replace crash_df with your data frame variable name and target_names with the names of the columns you want to select
crash_targets <- crash_df %>%
select(target_names)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(target_names)
##
## # Now:
## data %>% select(all_of(target_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print information about the selected columns
str(crash_targets)
## tibble [804,227 × 6] (S3: tbl_df/tbl/data.frame)
## $ has_injuries : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
## $ has_fatal : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ crash_type : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
## $ damage : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ injuries_total: int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
## $ injuries_fatal: int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
str(crash_df)
## tibble [804,227 × 32] (S3: tbl_df/tbl/data.frame)
## $ crash_date : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
## $ crash_year : num [1:804227] 2023 2023 2023 2023 2023 ...
## $ crash_month : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
## $ crash_day_of_week : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
## $ crash_hour : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
## $ crash_time_of_day : Factor w/ 4 levels "overnight","morning",..: 3 3 3 1 2 4 2 3 3 2 ...
## $ latitude : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
## $ longitude : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
## $ beat_of_occurrence : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
## $ address : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
## $ street_no : chr [1:804227] "2101" "3422" "1320" "999" ...
## $ street_direction : chr [1:804227] "S" "N" "E" "N" ...
## $ street_name : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
## $ posted_speed_limit : num [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
## $ traffic_control_device : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
## $ device_condition : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
## $ weather_condition : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
## $ lighting_condition : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
## $ trafficway_type : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
## $ alignment : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
## $ roadway_surface_cond : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
## $ road_defect : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
## $ first_crash_type : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
## $ prim_contributory_cause: chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
## $ sec_contributory_cause : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ num_units : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
## $ has_injuries : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
## $ has_fatal : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ crash_type : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
## $ damage : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ injuries_total : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
## $ injuries_fatal : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
# File path to the Parquet file
file_crash_df_parquet <- "crash_df.parquet"
# Read the Parquet file into a dataframe
crash_df <- arrow::read_parquet(file_crash_df_parquet)
# Print information about the dataframe
str(crash_df)
## tibble [804,227 × 32] (S3: tbl_df/tbl/data.frame)
## $ crash_date : POSIXct[1:804227], format: "2023-07-29 14:45:00" "2023-08-18 17:58:00" ...
## $ crash_year : num [1:804227] 2023 2023 2023 2023 2023 ...
## $ crash_month : int [1:804227] 7 8 7 7 9 8 8 9 8 7 ...
## $ crash_day_of_week : int [1:804227] 7 6 7 7 4 4 1 4 7 1 ...
## $ crash_hour : int [1:804227] 14 17 14 0 12 19 10 18 14 12 ...
## $ crash_time_of_day : Factor w/ 4 levels "overnight","morning",..: 3 3 3 1 2 4 2 3 3 2 ...
## $ latitude : num [1:804227] 41.9 41.9 41.8 41.9 41.7 ...
## $ longitude : num [1:804227] -87.7 -87.8 -87.6 -87.7 -87.6 ...
## $ beat_of_occurrence : int [1:804227] 1235 1633 222 1211 412 731 1712 1811 1214 2525 ...
## $ address : chr [1:804227] "2101 S ASHLAND AVE" "3422 N LONG AVE" "1320 E 47TH ST" "999 N CALIFORNIA AVE" ...
## $ street_no : chr [1:804227] "2101" "3422" "1320" "999" ...
## $ street_direction : chr [1:804227] "S" "N" "E" "N" ...
## $ street_name : chr [1:804227] "ASHLAND AVE" "LONG AVE" "47TH ST" "CALIFORNIA AVE" ...
## $ posted_speed_limit : num [1:804227] 30 30 10 30 15 30 30 30 30 20 ...
## $ traffic_control_device : chr [1:804227] "TRAFFIC SIGNAL" "NO CONTROLS" "NO CONTROLS" "TRAFFIC SIGNAL" ...
## $ device_condition : chr [1:804227] "FUNCTIONING PROPERLY" "NO CONTROLS" "NO CONTROLS" "FUNCTIONING PROPERLY" ...
## $ weather_condition : chr [1:804227] "CLEAR" "CLEAR" "UNKNOWN" "CLEAR" ...
## $ lighting_condition : chr [1:804227] "DAYLIGHT" "DAYLIGHT" "UNKNOWN" "DARKNESS" ...
## $ trafficway_type : chr [1:804227] "DIVIDED - W/MEDIAN (NOT RAISED)" "NOT DIVIDED" "PARKING LOT" "NOT DIVIDED" ...
## $ alignment : chr [1:804227] "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" "STRAIGHT AND LEVEL" ...
## $ roadway_surface_cond : chr [1:804227] "DRY" "DRY" "UNKNOWN" "UNKNOWN" ...
## $ road_defect : chr [1:804227] "NO DEFECTS" "NO DEFECTS" "UNKNOWN" "UNKNOWN" ...
## $ first_crash_type : chr [1:804227] "PARKED MOTOR VEHICLE" "PEDALCYCLIST" "ANGLE" "SIDESWIPE OPPOSITE DIRECTION" ...
## $ prim_contributory_cause: chr [1:804227] "FAILING TO REDUCE SPEED TO AVOID CRASH" "FAILING TO REDUCE SPEED TO AVOID CRASH" "UNABLE TO DETERMINE" "UNABLE TO DETERMINE" ...
## $ sec_contributory_cause : chr [1:804227] "OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER" "UNABLE TO DETERMINE" "NOT APPLICABLE" "NOT APPLICABLE" ...
## $ num_units : int [1:804227] 4 2 2 2 2 2 3 2 2 2 ...
## $ has_injuries : int [1:804227] 0 1 0 0 0 1 0 0 0 0 ...
## $ has_fatal : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
## $ crash_type : chr [1:804227] "NO INJURY / DRIVE AWAY" "INJURY AND / OR TOW DUE TO CRASH" "NO INJURY / DRIVE AWAY" "NO INJURY / DRIVE AWAY" ...
## $ damage : chr [1:804227] "OVER $1,500" "$501 - $1,500" "$501 - $1,500" "OVER $1,500" ...
## $ injuries_total : int [1:804227] 0 1 0 0 0 5 0 0 0 0 ...
## $ injuries_fatal : int [1:804227] 0 0 0 0 0 0 0 0 0 0 ...
# Subset the data where crash_year > 2017 and crash_year < 2021
subset_df <- subset(crash_df, crash_year > 2017 & crash_year < 2021)
# Print the subsetted dataframe
head(subset_df)
## # A tibble: 6 × 32
## crash_date crash_year crash_month crash_day_of_week crash_hour
## <dttm> <dbl> <int> <int> <int>
## 1 2019-08-18 19:27:00 2019 8 1 19
## 2 2018-07-27 10:18:00 2018 7 6 10
## 3 2020-05-22 04:00:00 2020 5 6 4
## 4 2019-06-11 08:40:00 2019 6 3 8
## 5 2020-01-07 05:45:00 2020 1 3 5
## 6 2019-03-17 01:53:00 2019 3 1 1
## # ℹ 27 more variables: crash_time_of_day <fct>, latitude <dbl>,
## # longitude <dbl>, beat_of_occurrence <int>, address <chr>, street_no <chr>,
## # street_direction <chr>, street_name <chr>, posted_speed_limit <dbl>,
## # traffic_control_device <chr>, device_condition <chr>,
## # weather_condition <chr>, lighting_condition <chr>, trafficway_type <chr>,
## # alignment <chr>, roadway_surface_cond <chr>, road_defect <chr>,
## # first_crash_type <chr>, prim_contributory_cause <chr>, …
# Filter rows based on the condition (crash_year > 2017 & crash_year < 2021)
crash_df <- crash_df %>%
filter(crash_year > 2017 & crash_year < 2021)
# Retrieve column names of the dataframe crash_df
column_names <- colnames(crash_df)
column_names
## [1] "crash_date" "crash_year"
## [3] "crash_month" "crash_day_of_week"
## [5] "crash_hour" "crash_time_of_day"
## [7] "latitude" "longitude"
## [9] "beat_of_occurrence" "address"
## [11] "street_no" "street_direction"
## [13] "street_name" "posted_speed_limit"
## [15] "traffic_control_device" "device_condition"
## [17] "weather_condition" "lighting_condition"
## [19] "trafficway_type" "alignment"
## [21] "roadway_surface_cond" "road_defect"
## [23] "first_crash_type" "prim_contributory_cause"
## [25] "sec_contributory_cause" "num_units"
## [27] "has_injuries" "has_fatal"
## [29] "crash_type" "damage"
## [31] "injuries_total" "injuries_fatal"
nrow(crash_df)
## [1] 326469
table(crash_df$crash_year)
##
## 2018 2019 2020
## 118377 116736 91356
features_names <- c(
'crash_date',
'crash_year',
'crash_month',
'crash_day_of_week',
'crash_hour',
'crash_time_of_day', # New
'latitude',
'longitude',
'beat_of_occurrence',
'address',
# 'street_no',
# 'street_direction',
# 'street_name',
'posted_speed_limit',
'traffic_control_device',
'device_condition',
'weather_condition',
'lighting_condition',
'trafficway_type',
'alignment',
'roadway_surface_cond',
'road_defect',
'first_crash_type',
'prim_contributory_cause',
'sec_contributory_cause',
'num_units'
)
target_names <- c(
'has_injuries', # New
'has_fatal', # New
'crash_type',
'damage',
'injuries_total',
'injuries_fatal'
# 'injuries_incapacitating',
# 'inj_non_incap',
# 'inj_report_not_evdnt',
# 'injuries_no_indication',
# 'most_severe_injury'
)
all_columns <- c(features_names, target_names)
all_columns
## [1] "crash_date" "crash_year"
## [3] "crash_month" "crash_day_of_week"
## [5] "crash_hour" "crash_time_of_day"
## [7] "latitude" "longitude"
## [9] "beat_of_occurrence" "address"
## [11] "posted_speed_limit" "traffic_control_device"
## [13] "device_condition" "weather_condition"
## [15] "lighting_condition" "trafficway_type"
## [17] "alignment" "roadway_surface_cond"
## [19] "road_defect" "first_crash_type"
## [21] "prim_contributory_cause" "sec_contributory_cause"
## [23] "num_units" "has_injuries"
## [25] "has_fatal" "crash_type"
## [27] "damage" "injuries_total"
## [29] "injuries_fatal"
table(crash_df$posted_speed_limit)
##
## 0 5 10 15 20 25 30 35 40 45 50
## 3225 2313 7134 11672 12902 20291 240562 23065 3102 1870 72
## 55 60 65 70
## 241 12 5 3
table(round(crash_df$posted_speed_limit, -1))
##
## 0 10 20 30 40 50 60 70
## 5538 7134 44865 240562 28037 72 258 3
crash_df %>%
mutate(rounded_speed_limit = (posted_speed_limit %/% 10 * 10) + round((posted_speed_limit %% 10) / 10) * 10) %>%
count(rounded_speed_limit)
## # A tibble: 8 × 2
## rounded_speed_limit n
## <dbl> <int>
## 1 0 5538
## 2 10 18806
## 3 20 33193
## 4 30 263627
## 5 40 4972
## 6 50 313
## 7 60 17
## 8 70 3
crash_df %>%
mutate(rounded_speed_limit = (posted_speed_limit %/% 5 * 5) + round((posted_speed_limit %% 5) / 5) * 5) %>%
count(rounded_speed_limit)
## # A tibble: 15 × 2
## rounded_speed_limit n
## <dbl> <int>
## 1 0 3225
## 2 5 2313
## 3 10 7134
## 4 15 11672
## 5 20 12902
## 6 25 20291
## 7 30 240562
## 8 35 23065
## 9 40 3102
## 10 45 1870
## 11 50 72
## 12 55 241
## 13 60 12
## 14 65 5
## 15 70 3
rounding_function <- function(x) {
(floor(x / 10) * 10) + round((x %% 10) / 10) * 10
}
result <- rounding_function(6)
result
## [1] 10
crash_df %>%
pull(beat_of_occurrence) %>%
n_distinct()
## [1] 271
crash_df %>%
pull(address) %>%
n_distinct()
## [1] 170695
crash_df %>%
group_by(latitude, longitude) %>%
summarise(n = n())
## `summarise()` has grouped output by 'latitude'. You can override using the
## `.groups` argument.
## # A tibble: 162,486 × 3
## # Groups: latitude [162,433]
## latitude longitude n
## <dbl> <dbl> <int>
## 1 41.6 -87.5 6
## 2 41.6 -87.6 1
## 3 41.6 -87.6 4
## 4 41.6 -87.6 2
## 5 41.6 -87.6 1
## 6 41.6 -87.6 1
## 7 41.6 -87.6 1
## 8 41.6 -87.6 1
## 9 41.6 -87.6 1
## 10 41.6 -87.6 1
## # ℹ 162,476 more rows
table(crash_df$crash_type)
##
## INJURY AND / OR TOW DUE TO CRASH NO INJURY / DRIVE AWAY
## 91008 235461
table(as.integer(crash_df$injuries_total > 0))
##
## 0 1
## 279680 46789
table(as.integer(crash_df$injuries_fatal > 0))
##
## 0 1
## 325701 347
table(crash_df$first_crash_type)
##
## ANGLE ANIMAL
## 34943 242
## FIXED OBJECT HEAD ON
## 16280 2739
## OTHER NONCOLLISION OTHER OBJECT
## 1121 3356
## OVERTURNED PARKED MOTOR VEHICLE
## 185 75272
## PEDALCYCLIST PEDESTRIAN
## 5158 8530
## REAR END REAR TO FRONT
## 74338 2470
## REAR TO REAR REAR TO SIDE
## 560 1539
## SIDESWIPE OPPOSITE DIRECTION SIDESWIPE SAME DIRECTION
## 4701 48197
## TRAIN TURNING
## 23 46815
table(crash_df$trafficway_type)
##
## ALLEY CENTER TURN LANE
## 5506 2897
## DIVIDED - W/MEDIAN (NOT RAISED) DIVIDED - W/MEDIAN BARRIER
## 55313 19167
## DRIVEWAY FIVE POINT, OR MORE
## 1131 351
## FOUR WAY L-INTERSECTION
## 13794 47
## NOT DIVIDED NOT REPORTED
## 146295 109
## ONE-WAY OTHER
## 42347 8752
## PARKING LOT RAMP
## 22138 1029
## ROUNDABOUT T-INTERSECTION
## 85 2896
## TRAFFIC ROUTE UNKNOWN
## 259 3127
## UNKNOWN INTERSECTION TYPE Y-INTERSECTION
## 872 354
table(crash_df$prim_contributory_cause)
##
## ANIMAL
## 301
## BICYCLE ADVANCING LEGALLY ON RED LIGHT
## 42
## CELL PHONE USE OTHER THAN TEXTING
## 471
## DISREGARDING OTHER TRAFFIC SIGNS
## 753
## DISREGARDING ROAD MARKINGS
## 459
## DISREGARDING STOP SIGN
## 3846
## DISREGARDING TRAFFIC SIGNALS
## 6600
## DISREGARDING YIELD SIGN
## 135
## DISTRACTION - FROM INSIDE VEHICLE
## 2427
## DISTRACTION - FROM OUTSIDE VEHICLE
## 1527
## DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)
## 175
## DRIVING ON WRONG SIDE/WRONG WAY
## 1658
## DRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## 9904
## EQUIPMENT - VEHICLE CONDITION
## 2242
## EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST
## 613
## EXCEEDING AUTHORIZED SPEED LIMIT
## 1025
## EXCEEDING SAFE SPEED FOR CONDITIONS
## 981
## FAILING TO REDUCE SPEED TO AVOID CRASH
## 15363
## FAILING TO YIELD RIGHT-OF-WAY
## 36513
## FOLLOWING TOO CLOSELY
## 32819
## HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## 368
## IMPROPER BACKING
## 13817
## IMPROPER LANE USAGE
## 12358
## IMPROPER OVERTAKING/PASSING
## 15271
## IMPROPER TURNING/NO SIGNAL
## 11089
## MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT
## 15
## NOT APPLICABLE
## 17708
## OBSTRUCTED CROSSWALKS
## 28
## OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## 4363
## PASSING STOPPED SCHOOL BUS
## 47
## PHYSICAL CONDITION OF DRIVER
## 2105
## RELATED TO BUS STOP
## 151
## ROAD CONSTRUCTION/MAINTENANCE
## 803
## ROAD ENGINEERING/SURFACE/MARKING DEFECTS
## 967
## TEXTING
## 141
## TURNING RIGHT ON RED
## 230
## UNABLE TO DETERMINE
## 119789
## UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## 2006
## VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## 2044
## WEATHER
## 5315
table(crash_df$sec_contributory_cause)
##
## ANIMAL
## 156
## BICYCLE ADVANCING LEGALLY ON RED LIGHT
## 119
## CELL PHONE USE OTHER THAN TEXTING
## 264
## DISREGARDING OTHER TRAFFIC SIGNS
## 355
## DISREGARDING ROAD MARKINGS
## 355
## DISREGARDING STOP SIGN
## 930
## DISREGARDING TRAFFIC SIGNALS
## 1290
## DISREGARDING YIELD SIGN
## 91
## DISTRACTION - FROM INSIDE VEHICLE
## 1036
## DISTRACTION - FROM OUTSIDE VEHICLE
## 602
## DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)
## 109
## DRIVING ON WRONG SIDE/WRONG WAY
## 629
## DRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## 9925
## EQUIPMENT - VEHICLE CONDITION
## 718
## EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST
## 173
## EXCEEDING AUTHORIZED SPEED LIMIT
## 787
## EXCEEDING SAFE SPEED FOR CONDITIONS
## 790
## FAILING TO REDUCE SPEED TO AVOID CRASH
## 13768
## FAILING TO YIELD RIGHT-OF-WAY
## 9389
## FOLLOWING TOO CLOSELY
## 8205
## HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## 405
## IMPROPER BACKING
## 2553
## IMPROPER LANE USAGE
## 4472
## IMPROPER OVERTAKING/PASSING
## 4690
## IMPROPER TURNING/NO SIGNAL
## 3237
## MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT
## 31
## NOT APPLICABLE
## 137441
## OBSTRUCTED CROSSWALKS
## 45
## OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## 2261
## PASSING STOPPED SCHOOL BUS
## 46
## PHYSICAL CONDITION OF DRIVER
## 1067
## RELATED TO BUS STOP
## 175
## ROAD CONSTRUCTION/MAINTENANCE
## 427
## ROAD ENGINEERING/SURFACE/MARKING DEFECTS
## 360
## TEXTING
## 63
## TURNING RIGHT ON RED
## 120
## UNABLE TO DETERMINE
## 113655
## UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## 621
## VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## 1166
## WEATHER
## 3943
table(crash_df$traffic_control_device)
##
## BICYCLE CROSSING SIGN DELINEATORS FLASHING CONTROL SIGNAL
## 14 98 111
## LANE USE MARKING NO CONTROLS NO PASSING
## 752 186811 14
## OTHER OTHER RAILROAD CROSSING OTHER REG. SIGN
## 2039 71 374
## OTHER WARNING SIGN PEDESTRIAN CROSSING SIGN POLICE/FLAGMAN
## 316 195 126
## RAILROAD CROSSING GATE RR CROSSING SIGN SCHOOL ZONE
## 195 38 111
## STOP SIGN/FLASHER TRAFFIC SIGNAL UNKNOWN
## 33451 90908 10379
## YIELD
## 466
summary(crash_df$injuries_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.197 0.000 21.000
# Week day mapping
week_days <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
# Weekday/Weekend mapping
is_weekday <- c("Weekend", "Weekday")
# Assuming crash_df is your dataframe
par(mfrow=c(3, 1)) # Set the layout to 3 rows and 1 column
for(year in unique(crash_df$crash_year)) {
hist(subset(crash_df, crash_year == year)$crash_hour,
breaks = 24,
main = paste("Crashes by Hour - Year", year),
xlab = "Hour",
ylab = "Frequency",
col = "lightblue",
xlim = c(0, 24),
ylim = c(0, max(table(crash_df$crash_hour))),
axes = FALSE)
axis(1, at=seq(0, 24, by=2)) # Add x-axis labels every 2 hours
axis(2) # Add y-axis
}

# Filter dataframe for crashes with injuries
crashes_with_injuries <- subset(crash_df, has_injuries == 1)
# Plot histograms by crash_year
ggplot(crashes_with_injuries, aes(x = crash_hour)) +
geom_histogram(binwidth = 1) +
facet_wrap(~ crash_year, nrow = 3) +
theme_minimal() +
labs(x = "Crash Hour", y = "Count") +
theme(legend.position = "none")

z <- cut(crash_df$crash_hour, breaks = c(-Inf, 6, 12, 18, Inf), labels = c('overnight', 'morning', 'mid_day', 'evening'))
head(z)
## [1] evening morning overnight morning overnight overnight
## Levels: overnight morning mid_day evening
table(z)
## z
## overnight morning mid_day evening
## 37765 97575 135325 55804
plot(z)
sum(is.na(crash_df$crash_hour))
## [1] 0
sum(is.na(z))
## [1] 0
summary(crash_df$crash_hour)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 9.0 14.0 13.2 17.0 23.0
crash_agg <- crash_df %>%
group_by(crash_year, crash_month, crash_hour) %>%
summarise(injuries_total = sum(injuries_total, na.rm = TRUE),
injuries_fatal = sum(injuries_fatal, na.rm = TRUE)) %>%
mutate(across(c(injuries_total, injuries_fatal), ~ifelse(is.na(.), 0, .))) %>%
ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
crash_pivot <- crash_agg %>%
pivot_wider(names_from = crash_year,
values_from = c(injuries_total, injuries_fatal),
values_fn = sum,
names_sep = "_")
# Aggregate data
crash_agg <- crash_df %>%
group_by(crash_year, crash_month, crash_time_of_day) %>%
summarise(has_injuries = sum(has_injuries),
has_fatal = sum(has_fatal),
injuries_total = sum(injuries_total),
injuries_fatal = sum(injuries_fatal)) %>%
mutate_at(vars(has_injuries, has_fatal, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .))
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Plotting
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = crash_time_of_day)) +
geom_line() +
facet_wrap(~ crash_year, ncol = 3) +
labs(title = "Chicago Crashes - Time of Day Trends", y = "Number of Injuries") +
theme_bw() +
scale_x_continuous(breaks = 1:12)

# Aggregate data
crash_agg <- crash_df %>%
group_by(crash_year, crash_month, crash_day_of_week) %>%
summarise(has_injuries = sum(has_injuries),
injuries_total = sum(injuries_total),
injuries_fatal = sum(injuries_fatal)) %>%
mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .))
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Plotting
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, fill = factor(crash_day_of_week))) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ crash_year, ncol = 3) +
theme_minimal() +
labs(x = "Month", y = "Total Injuries", title = "Total Injuries by Month and Day of Week") +
scale_fill_brewer(palette = "Set1") +
guides(fill = guide_legend(title = "Day of Week")) +
scale_x_continuous(breaks = 1:12)

sum(crash_df$weather_condition == 'OTHER', na.rm = TRUE)
## [1] 1072
# Grouping and aggregating
crash_agg <- crash_df %>%
group_by(crash_year, crash_month, lighting_condition) %>%
summarise(has_injuries = sum(has_injuries, na.rm = TRUE),
injuries_total = sum(injuries_total, na.rm = TRUE),
injuries_fatal = sum(injuries_fatal, na.rm = TRUE)) %>%
mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .)) %>%
ungroup() %>%
filter(lighting_condition != "UNKNOWN")
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Creating FacetGrid plot
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, col = lighting_condition)) +
geom_line() +
facet_wrap(~ crash_year, ncol = 3) +
labs(title = "Chicago Crashes - Lighting Condition Trends", y = "Has Injuries") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))+
scale_x_continuous(breaks = 1:12)

crash_agg <- crash_df %>%
group_by(crash_year, crash_month, weather_condition) %>%
summarise(has_injuries = sum(has_injuries),
injuries_total = sum(injuries_total),
injuries_fatal = sum(injuries_fatal)) %>%
mutate(across(c(has_injuries, injuries_total, injuries_fatal), ~ifelse(is.na(.), 0, .))) %>%
ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = weather_condition)) +
geom_line() +
facet_wrap(~crash_year, ncol = 3) +
labs(title = "Chicago Crashes - Weather Condition Trends", y = "Total Injuries") +
theme(plot.title = element_text(hjust = 0.5))+
scale_x_continuous(breaks = 1:12)

# Grouping and aggregating
crash_agg <- crash_df %>%
group_by(crash_year, crash_month, weather_condition) %>%
summarise(has_injuries = sum(has_injuries),
injuries_total = sum(injuries_total),
injuries_fatal = sum(injuries_fatal)) %>%
mutate_at(vars(has_injuries, injuries_total, injuries_fatal), ~ ifelse(is.na(.), 0, .)) %>%
filter(!weather_condition %in% c("CLEAR")) %>%
ungroup()
## `summarise()` has grouped output by 'crash_year', 'crash_month'. You can
## override using the `.groups` argument.
# Creating facet grid
ggplot(crash_agg, aes(x = crash_month, y = has_injuries, color = weather_condition)) +
geom_line() +
facet_wrap(~crash_year, ncol = 3) +
labs(title = "Chicago Crashes - Weather Condition Trends (Excluding CLEAR)", y = "Has Injuries") +
theme(plot.title = element_text(face = "bold"))+
scale_x_continuous(breaks = 1:12)

# Create a copy of the dataframe
crash_sdf <- crash_df
# Select columns to be standardized
col_names <- c('injuries_total', 'injuries_fatal')
features <- crash_sdf[col_names]
# Standardize the selected columns
scaled_features <- scale(features)
# Update the original dataframe with standardized values
crash_sdf[col_names] <- scaled_features
# Display the first 5 rows of the updated dataframe
head(crash_sdf, 5)
## # A tibble: 5 × 32
## crash_date crash_year crash_month crash_day_of_week crash_hour
## <dttm> <dbl> <int> <int> <int>
## 1 2019-08-18 19:27:00 2019 8 1 19
## 2 2018-07-27 10:18:00 2018 7 6 10
## 3 2020-05-22 04:00:00 2020 5 6 4
## 4 2019-06-11 08:40:00 2019 6 3 8
## 5 2020-01-07 05:45:00 2020 1 3 5
## # ℹ 27 more variables: crash_time_of_day <fct>, latitude <dbl>,
## # longitude <dbl>, beat_of_occurrence <int>, address <chr>, street_no <chr>,
## # street_direction <chr>, street_name <chr>, posted_speed_limit <dbl>,
## # traffic_control_device <chr>, device_condition <chr>,
## # weather_condition <chr>, lighting_condition <chr>, trafficway_type <chr>,
## # alignment <chr>, roadway_surface_cond <chr>, road_defect <chr>,
## # first_crash_type <chr>, prim_contributory_cause <chr>, …
# Plot Map
crash_agg <- crash_df %>%
group_by(longitude, latitude) %>%
summarise(crashes = n()) %>%
ungroup()
## `summarise()` has grouped output by 'longitude'. You can override using the
## `.groups` argument.
head(crash_agg)
## # A tibble: 6 × 3
## longitude latitude crashes
## <dbl> <dbl> <int>
## 1 -87.9 42.0 1
## 2 -87.9 42.0 1
## 3 -87.9 42.0 23
## 4 -87.9 42.0 1
## 5 -87.9 42.0 4
## 6 -87.9 42.0 5
# Create a copy of crash_df
crash_df_ <- crash_df
# Group by 'longitude', 'latitude', and 'crash_year', then summarize
crash_df_ <- crash_df_ %>%
group_by(longitude, latitude, crash_year) %>%
summarize(
crashes = n(),
has_injuries = max(has_injuries),
is_weekday = max(is_weekday) # is_weekday is a binary indicator
) %>%
ungroup() %>%
filter(crashes > 0)
## `summarise()` has grouped output by 'longitude', 'latitude'. You can override
## using the `.groups` argument.
head(crash_df_)
## # A tibble: 6 × 6
## longitude latitude crash_year crashes has_injuries is_weekday
## <dbl> <dbl> <dbl> <int> <int> <chr>
## 1 -87.9 42.0 2020 1 0 Weekend
## 2 -87.9 42.0 2019 1 0 Weekend
## 3 -87.9 42.0 2019 8 1 Weekend
## 4 -87.9 42.0 2020 15 1 Weekend
## 5 -87.9 42.0 2019 1 1 Weekend
## 6 -87.9 42.0 2019 2 0 Weekend
# Filter data for crash_year == 2018
crash_df_2018 <- subset(crash_df_, crash_year == 2018)
# Create scatter plot - 2018
ggplot(crash_df_2018, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
geom_point(alpha = 0.5) + # Add points with transparency
scale_size_continuous(range = c(0, 5)) + # Adjust the size range
labs(x = NULL, y = NULL, title = "Crashes in Chicago 2018") + # Remove axis labels and set title
theme_minimal() + # Minimal theme
theme(axis.text = element_blank(), # Remove axis text
axis.title = element_blank(), # Remove axis title
plot.title = element_text(hjust = 0.5)) # Center plot title

# Filter data for crash_year == 2019
crash_df_2019 <- subset(crash_df_, crash_year == 2019)
# Create scatter plot - 2019
ggplot(crash_df_2019, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
geom_point(alpha = 0.5) + # Add points with transparency
scale_size_continuous(range = c(0, 5)) + # Adjust the size range
labs(x = NULL, y = NULL, title = "Crashes in Chicago 2019") + # Remove axis labels and set title
theme_minimal() + # Minimal theme
theme(axis.text = element_blank(), # Remove axis text
axis.title = element_blank(), # Remove axis title
plot.title = element_text(hjust = 0.5)) # Center plot title

# Filter data for crash_year == 2020
crash_df_2020 <- subset(crash_df_, crash_year == 2020)
# Create scatter plot - 2019
ggplot(crash_df_2020, aes(x = longitude, y = latitude, size = crashes, color = has_injuries)) +
geom_point(alpha = 0.5) + # Add points with transparency
scale_size_continuous(range = c(0, 5)) + # Adjust the size range
labs(x = NULL, y = NULL, title = "Crashes in Chicago 2020") + # Remove axis labels and set title
theme_minimal() + # Minimal theme
theme(axis.text = element_blank(), # Remove axis text
axis.title = element_blank(), # Remove axis title
plot.title = element_text(hjust = 0.5)) # Center plot title
